1 install all required packages, including their dependencies

Remove “#” in the below code chunk if you want to run it

#install.packages(c("phyloseq", "ggplot2", "microbiome", "dplyr", "plyr", 
#                   "ggpubr", "vegan", "plotly", "ANCOMBC", "tibble", 
#                   "knitr", "viridis", "lmerTest"))

2 Load the libraries and cured phyloseq object

# Cleaning the global environment:
rm(list=ls())

# Loading libraries:
library(phyloseq)
library(ggplot2)
library(microbiome)
library(dplyr)
library(plyr)
library(ggpubr)
library(vegan)
library(plotly)
library(ggpubr)
library(plotly)
library(ANCOMBC)
library(tibble)
library(lmerTest)
library(tidyverse)
library(mixOmics)

theme_set(theme_classic())

# Setting the working directory:
#setwd("~/Documents/Postdoc/Microbiome course Belgrade 2024/Second day")

# Load the cured phyloseq object:
ps <- readRDS("MPS.16S.triads.mod2.RDS")
ps_2 <- prune_samples(ps@sam_data$Sample_Type!="PostFMT", ps)

Important: ps_2 is a curated and normalized phyloseq object which was created in the previous step.

Taxonomy levels:

# Summary of basic information of phyloseq object:
summarize_phyloseq(ps_2)
## [[1]]
## [1] "1] Min. number of reads = 31640"
## 
## [[2]]
## [1] "2] Max. number of reads = 44945"
## 
## [[3]]
## [1] "3] Total number of reads = 1645521"
## 
## [[4]]
## [1] "4] Average number of reads = 37398.2045454545"
## 
## [[5]]
## [1] "5] Median number of reads = 37625.5"
## 
## [[6]]
## [1] "7] Sparsity = 0.838128020424911"
## 
## [[7]]
## [1] "6] Any OTU sum to 1 or less? YES"
## 
## [[8]]
## [1] "8] Number of singletons = 374"
## 
## [[9]]
## [1] "9] Percent of OTUs that are singletons \n        (i.e. exactly one read detected across all samples)3.17619525242394"
## 
## [[10]]
## [1] "10] Number of sample variables are: 6"
## 
## [[11]]
## [1] "Sample_Type"     "Donor_Sample_ID" "Host_Sex"        "Subject_ID"     
## [5] "Time_Point"      "Age"
colnames(tax_table(ps_2))
## [1] "Kingdom" "Phylum"  "Class"   "Order"   "Family"  "Genus"   "Species"
# How many family/phyla/genera are there in the phylosec object?
table(tax_table(ps_2)[, "Phylum"], exclude = NULL) 
## 
##     Actinobacteria      Bacteroidetes      Cyanobacteria Epsilonbacteraeota 
##                225                545                 14                  1 
##      Euryarchaeota         Firmicutes       Fusobacteria Kiritimatiellaeota 
##                  8               2057                  7                  1 
##      Lentisphaerae    Patescibacteria     Proteobacteria      Synergistetes 
##                  7                  2                 80                  5 
##        Tenericutes    Verrucomicrobia               <NA> 
##                 21                 15                  3
sort(table(tax_table(ps_2)[, "Phylum"], exclude = NULL), decreasing = TRUE)
## 
##         Firmicutes      Bacteroidetes     Actinobacteria     Proteobacteria 
##               2057                545                225                 80 
##        Tenericutes    Verrucomicrobia      Cyanobacteria      Euryarchaeota 
##                 21                 15                 14                  8 
##       Fusobacteria      Lentisphaerae      Synergistetes               <NA> 
##                  7                  7                  5                  3 
##    Patescibacteria Epsilonbacteraeota Kiritimatiellaeota 
##                  2                  1                  1
# How many unique family/phyla/genera does this data set have?
length(unique(tax_table(ps_2)[, "Phylum"]))
## [1] 15
# 15 or 14???

3 Filtering steps

# Excluding the Phylum=NA:
ps_3 <- subset_taxa(ps_2, !is.na(Phylum))
sum(is.na(ps_3@tax_table[,"Phylum"])==T)
## [1] 0
table(ps_3@tax_table[,"Phylum"],  useNA = "ifany")
## 
##     Actinobacteria      Bacteroidetes      Cyanobacteria Epsilonbacteraeota 
##                225                545                 14                  1 
##      Euryarchaeota         Firmicutes       Fusobacteria Kiritimatiellaeota 
##                  8               2057                  7                  1 
##      Lentisphaerae    Patescibacteria     Proteobacteria      Synergistetes 
##                  7                  2                 80                  5 
##        Tenericutes    Verrucomicrobia 
##                 21                 15
prevdf = apply(X = otu_table(ps_3), MARGIN = ifelse(taxa_are_rows(ps_3), yes = 1, no = 2), FUN = function(x){sum(x > 0)})

# Add taxonomy and total read counts to this data.frame
prevdf <- data.frame(Prevalence = prevdf, TotalAbundance = taxa_sums(ps_3), tax_table(ps_3))

plyr::ddply(prevdf, "Phylum", function(df1){cbind(mean(df1$Prevalence),sum(df1$Prevalence))})
##                Phylum         1     2
## 1      Actinobacteria 5.9377778  1336
## 2       Bacteroidetes 4.8935780  2667
## 3       Cyanobacteria 2.0000000    28
## 4  Epsilonbacteraeota 0.0000000     0
## 5       Euryarchaeota 4.3750000    35
## 6          Firmicutes 8.1137579 16690
## 7        Fusobacteria 0.8571429     6
## 8  Kiritimatiellaeota 0.0000000     0
## 9       Lentisphaerae 2.0000000    14
## 10    Patescibacteria 3.5000000     7
## 11     Proteobacteria 4.7000000   376
## 12      Synergistetes 1.4000000     7
## 13        Tenericutes 1.7142857    36
## 14    Verrucomicrobia 6.2000000    93
# Subset to the remaining phyla
prevdf1 = subset(prevdf, Phylum %in% get_taxa_unique(ps_3, "Phylum"))

# Visualization:
ggplot(prevdf1, aes(TotalAbundance, Prevalence / nsamples(ps_3),color=Phylum)) +
  # Include a guess for parameter
  geom_hline(yintercept = 0.05, alpha = 0.5, linetype = 2) +
  geom_point(size = 2, alpha = 0.7) + scale_color_viridis_d(option = "plasma") +
  scale_x_log10() +  xlab("Total Abundance") + ylab("Prevalence [Frac. Samples]") +
  facet_wrap(~Phylum) + theme(legend.position="none")

# Filtering based on the prevalence:
prevalenceThreshold = 0.05 * nsamples(ps_3) ###### considers taxa present in at last 5% of all samples
prevalenceThreshold
## [1] 2.2
keepTaxa = rownames(prevdf)[(prevdf$Prevalence >= prevalenceThreshold)]
ps_4 = prune_taxa(keepTaxa, ps_3)

# Use phyloseq's built-in functions for filtering:
ps_5 <- filter_taxa(ps_3, function(x) sum(x > 0) >= (0.05 * nsamples(ps_3)), prune = TRUE)

# ps_4 = ps_5, it keeps only taxa that appear in at least 5% of all samples.
ntaxa(ps_3)
## [1] 2988
ntaxa(ps_4)
## [1] 1607
ntaxa(ps_5)
## [1] 1607
# filter based on a certain certain family/phyla/genera:
ps_3_firmicutes <- subset_taxa(ps_3, Phylum == "Firmicutes")

# filter based on certain group/s (males for example)
ps_3_males <- subset_samples(ps_3, Host_Sex == "Male")

4 Bar plots and Heatmaps

plot_bar(ps_4,fill="Phylum") + 
  theme(legend.position="bottom") + 
  theme(axis.title.x=element_blank(),  axis.text.x=element_blank()) 

For a better visualization we can normalize the libraries so that every sample has similar (relative) abundances (range between 0 and 1). there are multiple ways to transform data and get relative abundances:

ps_4_RA <- transform_sample_counts(ps_4, function(x) 100 * x/sum(x)) 
ps_4_RA <- microbiome::transform(ps_4,"compositional")

plot of relative abundances:

plot_bar(ps_4_RA,fill='Phylum') + 
  theme(legend.position="bottom") + 
  theme(axis.title.x=element_blank(),  axis.text.x=element_blank()) 

# group by Sample_Type:
plot_bar(ps_4_RA,fill="Phylum") + 
  theme(legend.position="bottom") + 
  theme(axis.title.x=element_blank(),  axis.text.x=element_blank()) +
  facet_grid(~Sample_Type, scales="free_x")

# Average per group:
phylumGlommed = tax_glom(ps_4_RA, "Phylum") 

plot_composition(phylumGlommed, average_by = "Sample_Type", transform = "compositional") 

# Does the labeling/legend match with with the phyla names?
# to find the related phylum names:
tax_table(phylumGlommed)
## Taxonomy Table:     [11 taxa by 7 taxonomic ranks]:
##           Kingdom    Phylum            Class Order Family Genus Species
## ASVX_3    "Bacteria" "Firmicutes"      NA    NA    NA     NA    NA     
## ASVX_846  "Bacteria" "Tenericutes"     NA    NA    NA     NA    NA     
## ASVX_1695 "Bacteria" "Synergistetes"   NA    NA    NA     NA    NA     
## ASVX_112  "Archaea"  "Euryarchaeota"   NA    NA    NA     NA    NA     
## ASVX_152  "Bacteria" "Verrucomicrobia" NA    NA    NA     NA    NA     
## ASVX_1019 "Bacteria" "Lentisphaerae"   NA    NA    NA     NA    NA     
## ASVX_139  "Bacteria" "Proteobacteria"  NA    NA    NA     NA    NA     
## ASVX_69   "Bacteria" "Bacteroidetes"   NA    NA    NA     NA    NA     
## ASVX_2355 "Bacteria" "Patescibacteria" NA    NA    NA     NA    NA     
## ASVX_273  "Bacteria" "Cyanobacteria"   NA    NA    NA     NA    NA     
## ASVX_20   "Bacteria" "Actinobacteria"  NA    NA    NA     NA    NA
# changing the labels in the legend:
colnames(phylumGlommed@otu_table) <- phylumGlommed@tax_table[,2]

# plot again:
plot_composition(phylumGlommed, average_by = "Sample_Type", transform = "compositional", legend = phylanames) 

Now let’s only look at “Firmicutes” phyla:

# subset the Firmicutes phyla:
ps_3_Firmicutes = subset_taxa(ps_3, Phylum== "Firmicutes")

# plot_abundance function:
plot_abundance = function(physeq,title = "",Facet = "Family", Color = "Family"){
 mphyseq = psmelt(physeq)
mphyseq <- subset(mphyseq, Abundance > 0)
  ggplot(data = mphyseq, mapping = aes_string(x = "Sample_Type",y = "Abundance",
                                              color = Color, fill = Color)) +
    geom_violin(fill = NA) +
    geom_point(size = 1, alpha = 0.3,
               position = position_jitter(width = 0.3)) +
    facet_wrap(facets = Facet) + scale_y_log10()+
    theme(legend.position="none")
}

# look at each family in Firmicutes phyla:
plot_abundance(ps_3_Firmicutes,"") 

# how if we want to look at a specific family like "Lachnospiraceae":
ps_3_Lachnospiraceae = subset_taxa(ps_2, Family == "Lachnospiraceae")

plot_abundance = function(physeq,title = "",Facet = "Genus", Color = "Genus"){
 mphyseq = psmelt(physeq)
mphyseq <- subset(mphyseq, Abundance > 0)
ggplot(data = mphyseq, mapping = aes_string(x = "Sample_Type",y = "Abundance",
color = Color, fill = Color)) + geom_violin(fill = NA) + geom_point(size = 1, alpha = 0.3,position = position_jitter(width = 0.3)) +  facet_wrap(facets = Facet) + scale_y_log10()+ theme(legend.position="none") }

plot_abundance(ps_3_Lachnospiraceae,"") 

5 Normalization

A much debated topic in microbiome data analysis is how to account for uneven sampling depth. Differences in sampling depth introduce a bias which needs to be accounted for. For instance, considering alpha diversity you could expect a strong correlation between observed species and sampling depth. One approach would be to subsample, (rarefy the data) in such a way that all samples would have equal sampling depth. However, some consider this bad practice as data is thrown away. Another issue is the loss of samples when a threshold higher than the actual sampling depth is set. If data is not normalized on the count table, sampling depth needs to be accounted for in the statistical test.

Others however have shown that not subsampling your data still results in bias. In the end one needs to consider the trade off between reporting of a false significant association (Type 1 error) driven by a potential sampling bias, or missing significant associations (Type 2 error) because of reduced statistical power that comes from data sub sampling and sample drop out.

Since statistical analysis are complex enough as it is, we prefer to err on the side of caution and recommend sub sampling the data prior to any analysis unless study design integrity is compromised. Here we proceed with normalization by a single step sub sampling of the data since we believe Type 1 errors are worse than Type 2.

** What would be an appropriate sampling depth for this dataset? **

sample_sums(ps_2) %>% sort()
## 2018_0007_00_SA508SA705 2018_0007_00_SB501SA702 2018_0007_00_SB501SA705 
##                   31640                   32083                   32086 
## 2018_0007_00_SB505SA706 2018_0007_00_SA502SA708 2018_0007_00_SB504SA708 
##                   32316                   32493                   32830 
## 2018_0007_00_SA501SA703 2018_0007_00_SB503SA712 2018_0007_00_SB503SA708 
##                   32839                   33623                   33834 
## 2018_0007_00_SA507SA708 2018_0007_00_SA504SA704 2018_0007_00_SA504SB705 
##                   33959                   34004                   34079 
## 2018_0007_00_SA505SA705 2018_0007_00_SA505SA711 2018_0007_00_SA506SA711 
##                   34218                   34705                   35056 
## 2018_0007_00_SB503SA704 2018_0007_00_SA508SB710 2018_0007_00_SA502SA707 
##                   35385                   35598                   35732 
## 2018_0007_00_SA505SB705 2018_0007_00_SA505SB709 2018_0007_00_SA502SB706 
##                   35767                   37095                   37376 
## 2018_0007_00_SA508SB712 2018_0007_00_SA506SA707 2018_0007_00_SA504SA711 
##                   37469                   37782                   37873 
## 2018_0007_00_SB507SA705 2018_0007_00_SA503SA708 2018_0007_00_SA505SA706 
##                   38589                   38919                   39060 
## 2018_0007_00_SA502SB712 2018_0007_00_SA507SB712 2018_0007_00_SA503SB707 
##                   39198                   39361                   39389 
## 2018_0007_00_SA502SB708 2018_0007_00_SA503SA705 2018_0007_00_SB504SA711 
##                   39547                   39549                   39705 
## 2018_0007_00_SA506SB701 2018_0007_00_SA507SA701 2018_0007_00_SA502SB703 
##                   40278                   40384                   40760 
## 2018_0007_00_SA503SA707 2018_0007_00_SA507SB703 2018_0007_00_SA505SB703 
##                   40777                   40843                   41100 
## 2018_0007_00_SA507SA709 2018_0007_00_SB508SA702 2018_0007_00_SA507SB702 
##                   41434                   43023                   44265 
## 2018_0007_00_SA506SB703 2018_0007_00_SA507SB711 
##                   44553                   44945
ps_2 <- rarefy_even_depth(ps_2, sample.size = 20000,  rngseed = 211202)

6 Alpha Diversity

There are different functions from different packages (phyloseq, vegan, microbiome) to calculate alpha diversity
Here we use microbiome package:

# check if any ASVs/taxa are not present in any of the samples
any(taxa_sums(ps_2)==0) 
## [1] FALSE
# prune dataset for taxa that are not present in any of the samples
# ps_2 <- prune_taxa(taxa_sums(ps_2)>0, ps_2)

# check how many ASVs/taxa are kept
ntaxa(ps_2)
## [1] 2610
# check the taxonomic rank information
rank_names(ps_2)
## [1] "Kingdom" "Phylum"  "Class"   "Order"   "Family"  "Genus"   "Species"
# check samples with low sequencing depth <1000
sample_sums(ps_2) < 1000
## 2018_0007_00_SA501SA703 2018_0007_00_SA502SA707 2018_0007_00_SA502SA708 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA502SB703 2018_0007_00_SA502SB706 2018_0007_00_SA502SB708 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA502SB712 2018_0007_00_SA503SA705 2018_0007_00_SA503SA707 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA503SA708 2018_0007_00_SA503SB707 2018_0007_00_SA504SA704 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA504SA711 2018_0007_00_SA504SB705 2018_0007_00_SA505SA705 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA505SA706 2018_0007_00_SA505SA711 2018_0007_00_SA505SB703 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA505SB705 2018_0007_00_SA505SB709 2018_0007_00_SA506SA707 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA506SA711 2018_0007_00_SA506SB701 2018_0007_00_SA506SB703 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA507SA701 2018_0007_00_SA507SA708 2018_0007_00_SA507SA709 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA507SB702 2018_0007_00_SA507SB703 2018_0007_00_SA507SB711 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA507SB712 2018_0007_00_SA508SA705 2018_0007_00_SA508SB710 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SA508SB712 2018_0007_00_SB501SA702 2018_0007_00_SB501SA705 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SB503SA704 2018_0007_00_SB503SA708 2018_0007_00_SB503SA712 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SB504SA708 2018_0007_00_SB504SA711 2018_0007_00_SB505SA706 
##                   FALSE                   FALSE                   FALSE 
## 2018_0007_00_SB507SA705 2018_0007_00_SB508SA702 
##                   FALSE                   FALSE
range(sample_sums(ps_2))
## [1] 20000 20000
# remove samples with low sequencing depth: (we do not have it here)
#ps_2 <- subset_samples(ps_2, sample_sums(ps_2) >1000)

# from microbiome package you can calculate all alpha diversity indexes or specific ones:
#ps_2_alpha <- microbiome::alpha(ps_2, index = "observed")
ps_2_alpha <- microbiome::alpha(ps_2, index = "all")
str(ps_2_alpha)
## 'data.frame':    44 obs. of  22 variables:
##  $ observed                  : num  399 453 393 396 371 418 406 459 321 435 ...
##  $ chao1                     : num  452 479 419 418 391 ...
##  $ diversity_inverse_simpson : num  41 73.7 28.8 32.6 47.1 ...
##  $ diversity_gini_simpson    : num  0.976 0.986 0.965 0.969 0.979 ...
##  $ diversity_shannon         : num  4.46 4.92 4.26 4.4 4.52 ...
##  $ diversity_fisher          : num  70.6 82.4 69.3 70 64.7 ...
##  $ diversity_coverage        : num  15 27 11 12 17 15 14 21 14 16 ...
##  $ evenness_camargo          : num  0.689 0.729 0.859 0.714 0.729 ...
##  $ evenness_pielou           : num  0.744 0.804 0.714 0.736 0.764 ...
##  $ evenness_simpson          : num  0.1028 0.1628 0.0733 0.0824 0.127 ...
##  $ evenness_evar             : num  0.223 0.241 0.239 0.253 0.237 ...
##  $ evenness_bulla            : num  0.336 0.391 0.324 0.357 0.348 ...
##  $ dominance_dbp             : num  0.0736 0.0387 0.1221 0.092 0.0649 ...
##  $ dominance_dmn             : num  0.1301 0.0741 0.1881 0.1658 0.1134 ...
##  $ dominance_absolute        : num  1473 775 2442 1839 1298 ...
##  $ dominance_relative        : num  0.0736 0.0387 0.1221 0.092 0.0649 ...
##  $ dominance_simpson         : num  0.0244 0.0136 0.0347 0.0306 0.0212 ...
##  $ dominance_core_abundance  : num  0.444 0.33 0.233 0.521 0.379 ...
##  $ dominance_gini            : num  0.971 0.958 0.974 0.969 0.97 ...
##  $ rarity_log_modulo_skewness: num  2.06 2.05 2.06 2.06 2.05 ...
##  $ rarity_low_abundance      : num  0.138 0.179 0.152 0.172 0.156 ...
##  $ rarity_rare_abundance     : num  0.344 0.361 0.614 0.292 0.313 ...
# extract metadata from phyloseq object (ps_2)
ps_2_meta <- meta(ps_2)
class(ps_2_meta)
## [1] "data.frame"
# merge the alpha diversity indexe/s with metadata (which is now a data frame)
rownames(ps_2_alpha) == rownames(ps_2_meta)
##  [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [16] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## [31] TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE TRUE
merged_df <- cbind(ps_2_meta,ps_2_alpha)
#colnames(merged_df)

# Shannon:
p1 <- ggviolin(merged_df, x = "Sample_Type", y = "diversity_shannon",
               add = "boxplot", fill = "Sample_Type")
print(p1)

# looking at the distribution of the variable to choose the best test for comparing:
summary(merged_df$diversity_shannon)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.753   4.343   4.529   4.546   4.776   5.131
hist(merged_df$diversity_shannon)

ggqqplot(merged_df$diversity_shannon)

shapiro.test(merged_df$diversity_shannon)
## 
##  Shapiro-Wilk normality test
## 
## data:  merged_df$diversity_shannon
## W = 0.9868, p-value = 0.8891
# is diversity_shannon (Shannon index) following a normal distribution or not?
# if normal: parametric tests are recommended
t.test(merged_df$diversity_shannon~Sample_Type,data=merged_df, var.equal = T)
## 
##  Two Sample t-test
## 
## data:  merged_df$diversity_shannon by Sample_Type
## t = 1.4618, df = 42, p-value = 0.1512
## alternative hypothesis: true difference in means between group PreFMT and group Donor is not equal to 0
## 95 percent confidence interval:
##  -0.05820449  0.36412339
## sample estimates:
## mean in group PreFMT  mean in group Donor 
##             4.587469             4.434510
var.test(merged_df$diversity_shannon~Sample_Type,data=merged_df)
## 
##  F test to compare two variances
## 
## data:  merged_df$diversity_shannon by Sample_Type
## F = 2.393, num df = 31, denom df = 11, p-value = 0.1265
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.7693335 5.8424866
## sample estimates:
## ratio of variances 
##           2.392961
# if not normal: non-parametric tests are recommended
wilcox.test(diversity_shannon ~ Sample_Type, data = merged_df, paired = FALSE)
## 
##  Wilcoxon rank sum exact test
## 
## data:  diversity_shannon by Sample_Type
## W = 254, p-value = 0.1057
## alternative hypothesis: true location shift is not equal to 0
# Univariable and multivariate alpha-diversity comparisson:
# Unadjusted:
Shannon_model1 <- lm(merged_df$diversity_shannon~merged_df$Sample_Type)
summary(Shannon_model1)
## 
## Call:
## lm(formula = merged_df$diversity_shannon ~ merged_df$Sample_Type)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.83481 -0.17242 -0.00933  0.21585  0.54324 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 4.58747    0.05464  83.951   <2e-16 ***
## merged_df$Sample_TypeDonor -0.15296    0.10464  -1.462    0.151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3091 on 42 degrees of freedom
## Multiple R-squared:  0.04842,    Adjusted R-squared:  0.02576 
## F-statistic: 2.137 on 1 and 42 DF,  p-value: 0.1512
# Adjusted: Let's say we want to adjust for Age ("Age") and sex ("Host_Sex"):
Shannon_model2 <- lm(merged_df$diversity_shannon ~ merged_df$Sample_Type +
                       merged_df$Host_Sex + merged_df$Age)
summary(Shannon_model2)
## 
## Call:
## lm(formula = merged_df$diversity_shannon ~ merged_df$Sample_Type + 
##     merged_df$Host_Sex + merged_df$Age)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82289 -0.15304 -0.00487  0.22079  0.54041 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 3.84869    0.64745   5.944 5.66e-07 ***
## merged_df$Sample_TypeDonor  0.06438    0.23418   0.275    0.785    
## merged_df$Host_SexMale      0.10744    0.09801   1.096    0.280    
## merged_df$Age               0.01475    0.01416   1.042    0.304    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3081 on 40 degrees of freedom
## Multiple R-squared:  0.09949,    Adjusted R-squared:  0.03195 
## F-statistic: 1.473 on 3 and 40 DF,  p-value: 0.2364
# Richness (Observed taxa)
p2 <- ggviolin(merged_df, x = "Sample_Type", y = "observed",
               add = "boxplot", fill = "Sample_Type")
print(p2)

# looking at the distribution of the variable to choose the best test for comparing:
summary(merged_df$observed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   270.0   396.0   438.0   438.8   471.0   580.0
hist(merged_df$observed)

ggqqplot(merged_df$observed)

shapiro.test(merged_df$observed)
## 
##  Shapiro-Wilk normality test
## 
## data:  merged_df$observed
## W = 0.97546, p-value = 0.4635
# is Richness (Observed taxa) following a normal distribution or not?
# if normal: parametric tests are recommended
t.test(merged_df$observed~Sample_Type,data=merged_df, var.equal = T)
## 
##  Two Sample t-test
## 
## data:  merged_df$observed by Sample_Type
## t = 0.94214, df = 42, p-value = 0.3515
## alternative hypothesis: true difference in means between group PreFMT and group Donor is not equal to 0
## 95 percent confidence interval:
##  -24.18459  66.53875
## sample estimates:
## mean in group PreFMT  mean in group Donor 
##             444.5938             423.4167
var.test(merged_df$observed~Sample_Type,data=merged_df)
## 
##  F test to compare two variances
## 
## data:  merged_df$observed by Sample_Type
## F = 2.0086, num df = 31, denom df = 11, p-value = 0.2201
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.6457505 4.9039701
## sample estimates:
## ratio of variances 
##           2.008564
# if not normal: non-parametric tests are recommended
wilcox.test(observed ~ Sample_Type, data = merged_df, paired = FALSE)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  observed by Sample_Type
## W = 233.5, p-value = 0.2798
## alternative hypothesis: true location shift is not equal to 0
# Univariable and multivariate alpha-diversity comparisson:
# Unadjusted:
Richness_model1 <- lm(merged_df$observed~merged_df$Sample_Type)
summary(Shannon_model1)
## 
## Call:
## lm(formula = merged_df$diversity_shannon ~ merged_df$Sample_Type)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.83481 -0.17242 -0.00933  0.21585  0.54324 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 4.58747    0.05464  83.951   <2e-16 ***
## merged_df$Sample_TypeDonor -0.15296    0.10464  -1.462    0.151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3091 on 42 degrees of freedom
## Multiple R-squared:  0.04842,    Adjusted R-squared:  0.02576 
## F-statistic: 2.137 on 1 and 42 DF,  p-value: 0.1512
# Adjusted: Let's say we want to adjust for Age ("Age") and sex ("Host_Sex"):
Richness_model2 <- lm(merged_df$observed ~ merged_df$Sample_Type +
                       merged_df$Age + merged_df$Host_Sex)
summary(Shannon_model2)
## 
## Call:
## lm(formula = merged_df$diversity_shannon ~ merged_df$Sample_Type + 
##     merged_df$Host_Sex + merged_df$Age)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82289 -0.15304 -0.00487  0.22079  0.54041 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 3.84869    0.64745   5.944 5.66e-07 ***
## merged_df$Sample_TypeDonor  0.06438    0.23418   0.275    0.785    
## merged_df$Host_SexMale      0.10744    0.09801   1.096    0.280    
## merged_df$Age               0.01475    0.01416   1.042    0.304    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3081 on 40 degrees of freedom
## Multiple R-squared:  0.09949,    Adjusted R-squared:  0.03195 
## F-statistic: 1.473 on 3 and 40 DF,  p-value: 0.2364

So based on the results there is no significant difference between the two groups (Rd increased versus decreased) in alpha diversity using Richness and Shannon index

**Note: Many papers are also using Chao1 index to determine richness levels. However, some of us consider this metric inappropriate for (amplicon) sequence data as it tends to extrapolate sequencing error rather than true richness.

7 Beta diversity

Beta diversity describes the variation in species composition between samples, often expressed as the “distance” or “dissimilarity” between them. The choice of distance metric depends on the context and goals of the analysis, as different metrics capture different aspects of community composition.

For example, Bray-Curtis and Jaccard distances treat each species or feature as a distinct unit, without considering relationships between species. These metrics only focus on shared presence or abundance between samples. In contrast, UniFrac distances account for the evolutionary relationships (phylogeny) between species. With UniFrac, samples with phylogenetically distinct species show higher distances than samples with species that are different but closely related. This makes UniFrac especially valuable when genetic or evolutionary relatedness is meaningful in the analysis.

Another key distinction is whether the distance metric considers feature abundance. Some methods are weighted (e.g., weighted UniFrac and Bray-Curtis), which incorporate the relative abundance of species in the calculation. Others are unweighted (e.g., Jaccard and unweighted UniFrac), which treat species as simply present or absent. Weighted metrics capture differences in both composition and abundance, while unweighted metrics focus solely on the presence or absence of species, providing a simpler, binary view of beta diversity.

Most common distance metrics used are:

bray: Bray-Curtis

unifrac: unweighted UniFrac distance

wunifrac: weighted-UniFrac distance

dpcoa: sample-wise distance used in Double Principle Coordinate Analysis

jsd: Jensen-Shannon Divergence

The obtained distance matrices are used both for visualization and statistical testing.

7.1 Ordination

In a static plot on a screen, we are typically limited to only two dimensions (x and y), which constrains our ability to visualize high-dimensional data. Ordination is a set of methods designed to reduce the dimensionality of complex datasets for the purpose of visualization, simplification, and interpretation. By reducing data from multiple dimensions to just two or three, ordination enables researchers to visualize and interpret underlying patterns, trends, or gradients that would otherwise be difficult to discern.

Ordination seeks to simplify the data by finding new dimensions, often called axes, components, or factors, that capture the greatest amount of variance (i.e., variation or information) in the dataset. The first axis is chosen to capture as much variance as possible in the data, and subsequent axes capture the remaining variance, each being orthogonal (or perpendicular) to the previous ones. This orthogonality ensures that each axis is independent and does not overlap in the information it represents. Through this process, high-dimensional relationships are projected into a lower-dimensional space in a way that retains as much of the original data’s structure as possible.

To achieve maximum variance on each axis, ordination techniques often involve mathematical transformations that “rotate” the data in its original multidimensional space. This rotation aligns the data along new axes that maximize distinct sources of variation, making it easier to identify meaningful ecological gradients or patterns.

A simple example of ordination is a 3d plot. While 3 dimensions can be viewed, the actual image is a projection in 2D.

p <- plot_ly(data = ps_2@otu_table@.Data %>% data.frame(), 
             x = ~ASVX_1, y = ~ASVX_2, z = ~ASVX_3, 
             color = ~ASVX_1,
             type = "scatter3d", 
             mode = "markers")
p

For performing the ordination we use ordinate() function in phyloseq which does both distance calculation and the ordination: Different ordination Constrained and unconstrained methods are available.

1- method (Optional). A character string. Default is “DCA”.

Currently supported method options are: c(“DCA”, “CCA”, “RDA”, “CAP”, “DPCoA”, “NMDS”, “MDS”, “PCoA”)

DCA: Performs detrended correspondence analysis usingdecorana

CCA: Performs correspondence analysis, or optionally, constrained correspondence analysis (a.k.a. canonical correspondence analysis), via cca

RDA: Performs redundancy analysis, if no contrainsts are given a principal components analysis is performed, via rda

CAP: [Partial] Constrained Analysis of Principal Coordinates or distance-based RDA, via capscale. See capscale.phyloseq for more details. In particular, a formula argument must be provided.

DPCoA: Performs Double Principle Coordinate Analysis using a (corrected, if necessary) phylogenetic/patristic distance between species. The calculation is performed by DPCoA(), which ultimately uses dpcoa after making the appropriate accessions/corrections of the data.

NMDS: Performs Non-metric MultiDimenstional Scaling of a sample-wise ecological distance matrix onto a user-specified number of axes, k. By default, k=2, but this can be modified as a supplementary argument. This method is ultimately carried out by metaMDS after the appropriate accessions and distance calculations. Because metaMDS includes its own distance calculation wrappers to vegdist, and these provide additional functionality in the form of species scores, ordinate will pass-on the distance argument to metaMDS if it is among the supported vegdist methods. However, all distance methods supported by distance are supported here, including “unifrac” (the default) and “DPCoA”.

MDS/PCoA: Performs principal coordinate analysis (also called principle coordinate decomposition, multidimensional scaling (MDS), or classical scaling) of a distance matrix (Gower 1966), including two correction methods for negative eigenvalues. See pcoa for further details.

Lets visualize the betadiversity using the most common distance metric Bray-Curtis and PCoA (MDS) ordination.

# generate distance metrics: (MDS (“PCoA”) on Bray-Curtis)
ord_ps_2_bray <- ordinate(ps_2, "MDS", "bray")

# generate the plot
p1_bray <- plot_ordination(ps_2, ord_ps_2_bray, type="samples", 
                           color="Sample_Type", title="Samples",
                           axes = 1:2, shape = NULL) +
  geom_point(size=3)
print(p1_bray)

## generate a 3D plot:
# Extract ordination scores
ord_scores_bray <- as.data.frame(ord_ps_2_bray$vectors)

# Add metadata to the scores
ord_scores_bray$Sample_Type <- sample_data(ps_2)$Sample_Type

# collect only the first 3 Axis and variable of interest (Sample_Type):
ord_scores_bray_3d <- ord_scores_bray[, c("Axis.1", "Axis.2", "Axis.3", "Sample_Type")]
colnames(ord_scores_bray_3d)[1:3] <- c("PCoA1", "PCoA2", "PCoA3")

p <- plot_ly(data = ord_scores_bray_3d, 
             x = ~PCoA1, y = ~PCoA2, z = ~PCoA3, 
             color = ~Sample_Type,
             type = "scatter3d", 
             mode = "markers")
p <- p %>% layout(scene = list(xaxis = list(title = 'PCoA1'),
                               yaxis = list(title = 'PCoA2'),
                               zaxis = list(title = 'PCoA3')),
                  title = "3D Bray-Curtis Ordination")
p
# calculate and generate multiple plots using one distance metrics (bray-curtis) and 
# multiple methods: 
dist = "bray"
ord_meths = c("DCA", "CCA", "RDA", "MDS")
plist = llply(as.list(ord_meths), function(i, physeq, dist){
  ordi = ordinate(physeq, method=i, distance=dist)
  plot_ordination(physeq, ordi, type="samples", color="Sample_Type")
}, ps_2, dist)

names(plist) <- ord_meths
pdataframe = ldply(plist, function(x){
  df = x$data[, 1:2]
  colnames(df) = c("Axis_1", "Axis_2")
  return(cbind(df, x$data))
})
names(pdataframe)[1] = "method"

p = ggplot(pdataframe, aes(Axis_1, Axis_2, color=Sample_Type))
p = p + geom_point(size=3)
p = p + facet_wrap(~method, scales="free")
p

# look at each one:
plist[[4]]  

################################################################################
# MDS (“PCoA”) on weighted Unifrac Distances:
ord_ps_2_WUni <- ordinate(ps_2, "PCoA", "unifrac", weighted=TRUE)
plot_ordination(ps_2, ord_ps_2_WUni, color="Sample_Type", shape=NULL)

# biplots for both samples and taxas:
plot_ordination(ps_2, ord_ps_2_WUni, type="biplot", color="Phylum", shape="Sample_Type")

# splited plots for samples and taxa:
plot_ordination(ps_2, ord_ps_2_WUni, type="split", color="Phylum", shape="Sample_Type")

It is bad practice to include the outcome in a constrained ordination. Due to the high dimensionality of the data there is usually some hyperplane that can seperate groups very well, even though no significant difference exists.

ord <- ordinate(ps_2, method = "RDA", formula = ~Sample_Type, distance = "bray")
plot_ordination(physeq = ps_2, ordination = ord, color="Sample_Type")

However, if we shuffle our parameter we can still obtain good seperation even though the association is now complete random.

ps_2@sam_data$Sample_Type_shuffle <- sample(ps_2@sam_data$Sample_Type)
ord <- ordinate(ps_2, method = "RDA", formula = ~Sample_Type_shuffle, distance = "bray")
plot_ordination(physeq = ps_2, ordination = ord, color="Sample_Type_shuffle")

8 PERMANOVA (Permutational Multivariate Analysis of Variance)

PERMANOVA uses distance matrices calculated from beta diversity measures (e.g., Bray-Curtis, Jaccard, UniFrac) as input. PERMANOVA can incorporate multiple explanatory variables and their interactions. So confounders can be considered. PERMANOVA is a non-parametric test, meaning it doesn’t assume normal distribution of the data.

# PERMANOVA using Bray-Curtis (method="bray") distance metrics
permanova1 <- adonis2(ps_2@otu_table ~ ps_2@sam_data [["Sample_Type"]], 
                      permutations=999, method = "bray")
permanova1
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = ps_2@otu_table ~ ps_2@sam_data[["Sample_Type"]], permutations = 999, method = "bray")
##                                Df SumOfSqs      R2      F Pr(>F)  
## ps_2@sam_data[["Sample_Type"]]  1   0.3164 0.03269 1.4195  0.038 *
## Residual                       42   9.3607 0.96731                
## Total                          43   9.6771 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#permanova1_adj <- adonis2(ps_2@otu_table ~ ps_2@sam_data [["Sample_Type"]] +
#                          ps_2@sam_data [["Age"]] +
#                          ps_2@sam_data [["Host_Sex"]], permutations=999,
#                          method = "bray")

permanova1_adj <- adonis2(ps_2@otu_table ~ ps_2@sam_data [["Age"]] +
                          ps_2@sam_data [["Host_Sex"]] +
                          ps_2@sam_data [["Sample_Type"]], permutations=999,
                          method = "bray")
permanova1_adj
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = ps_2@otu_table ~ ps_2@sam_data[["Age"]] + ps_2@sam_data[["Host_Sex"]] + ps_2@sam_data[["Sample_Type"]], permutations = 999, method = "bray")
##                                Df SumOfSqs      R2      F Pr(>F)  
## ps_2@sam_data[["Age"]]          1   0.3268 0.03377 1.4655  0.024 *
## ps_2@sam_data[["Host_Sex"]]     1   0.2448 0.02530 1.0979  0.267  
## ps_2@sam_data[["Sample_Type"]]  1   0.1850 0.01912 0.8296  0.816  
## Residual                       40   8.9204 0.92181                
## Total                          43   9.6771 1.00000                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Attention: the order of covariates matter because the covariates’ order can change the proportion of the variation that a variable can have on the microbiome composition.

It is recommended to put your variable of interest at the end of the formula. This way variance that is associated with confounders is attributed to these rather the variable of interest.

9 Differential abundance analysis

Determining differential abundance of microbes might seem trivial, but results show that outcomes of these analysis can be very stochastic and spurious depending on the methods used r Langille Lahti.

9.1 ANCOMBC

Analysis of Compositions of Microbiomes with Bias Correction (ANCOM-BC2)

ANCOMBC2() from ANCOMBC package can run on different Taxonomy levels (Phylum, Family, Genus, etc)

ANCOM-BC can handle multiple covariates and multiple test correction. Hence it is possible to add confounders as covariates in the formula.

# Phylum level
ancom1 = ancombc(data = ps_2, assay_name = "counts", 
              tax_level = "Phylum", phyloseq = NULL, 
              formula = "Age + Host_Sex + Sample_Type", 
              p_adj_method = "BH", prv_cut = 0.10, lib_cut = 1000, 
              group = "Sample_Type", struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5, 
              max_iter = 100, conserve = TRUE, alpha = 0.05, global = TRUE,
              n_cl = 1, verbose = TRUE)
#ancom1[["res"]]

# is there any phylum differ between the groups?
ancom1[["res"]]$diff_abn$taxon [ancom1[["res"]]$diff_abn$Sample_Type==TRUE]
## [1] "Phylum:Verrucomicrobia" "Phylum:Lentisphaerae"   "Phylum:Patescibacteria"
# Family level
ancom2 = ancombc(data = ps_2, assay_name = "counts", 
              tax_level = "Family", phyloseq = NULL, 
              formula = "Age + Host_Sex + Sample_Type", 
              p_adj_method = "BH", prv_cut = 0.10, lib_cut = 1000, 
              group = "Sample_Type", struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5, 
              max_iter = 100, conserve = TRUE, alpha = 0.05, global = TRUE,
              n_cl = 1, verbose = TRUE)
#ancom2[["res"]]

# is there any family differ between the groups?
ancom2[["res"]]$diff_abn$taxon [ancom2[["res"]]$diff_abn$Sample_Type==TRUE]
## [1] "Family:Clostridiales_vadinBB60_group"
## [2] "Family:Family_XI"                    
## [3] "Order:Izimaplasmatales"              
## [4] "Order:Mollicutes_RF39"               
## [5] "Family:Succinivibrionaceae"          
## [6] "Order:Bacteroidales"                 
## [7] "Family:Saccharimonadaceae"
# generating relative abundances:
ps_2_RA <- transform_sample_counts(ps_2, function(x) x/sum(x) * 100)

# Relative abundance Enterococcaceae family:
ps_2_Streptococcaceae = subset_taxa(ps_2_RA, Family == "Streptococcaceae")
plot_abundance = function(physeq,title = "",Facet = "Family", Color = "Family"){
 mphyseq = psmelt(physeq)
mphyseq <- subset(mphyseq, Abundance > -1)
  ggplot(data = mphyseq, mapping = aes_string(x = "Sample_Type",y = "Abundance",
                                              color = Color, fill = Color)) +
    geom_violin(fill = NA) +
    geom_boxplot(fill=NA, width = 0.1) +
    geom_point(size = 5, alpha = 0.3,
               position = position_jitter(width = 0.3)) +
    facet_wrap(facets = Facet) + 
    scale_y_log10()+
    theme(legend.position="none")
}
plot_abundance(ps_2_Streptococcaceae,"") 

# Genus level
ancom3 = ancombc(data = ps_2, assay_name = "counts", 
              tax_level = "Genus", phyloseq = NULL, 
              formula = "Age + Host_Sex + Sample_Type", 
              p_adj_method = "BH", prv_cut = 0.10, lib_cut = 1000, 
              group = "Sample_Type", struc_zero = TRUE, neg_lb = TRUE, tol = 1e-5, 
              max_iter = 100, conserve = TRUE, alpha = 0.05, global = TRUE,
              n_cl = 1, verbose = TRUE)
#ancom3[["res"]]

# is there any genus differ between the groups?
ancom3[["res"]]$diff_abn$taxon [ancom3[["res"]]$diff_abn$Sample_Type==TRUE]
##  [1] "Genus:Shuttleworthia"                
##  [2] "Genus:Sellimonas"                    
##  [3] "Genus:Fournierella"                  
##  [4] "Genus:Ruminococcaceae_UCG-008"       
##  [5] "Genus:Ruminococcaceae_V9D2013_group" 
##  [6] "Genus:Ruminococcaceae_UCG-007"       
##  [7] "Genus:Ruminiclostridium"             
##  [8] "Family:Clostridiales_vadinBB60_group"
##  [9] "Genus:Mitsuokella"                   
## [10] "Genus:Veillonella"                   
## [11] "Genus:Anaerofustis"                  
## [12] "Genus:Parvimonas"                    
## [13] "Order:Izimaplasmatales"              
## [14] "Order:Mollicutes_RF39"               
## [15] "Family:Erysipelotrichaceae"          
## [16] "Genus:Holdemania"                    
## [17] "Genus:Faecalitalea"                  
## [18] "Genus:Merdibacter"                   
## [19] "Genus:Methanosphaera"                
## [20] "Genus:Succinivibrio"                 
## [21] "Genus:Enterobacter"                  
## [22] "Order:Bacteroidales"                 
## [23] "Genus:Prevotellaceae_UCG-001"        
## [24] "Genus:Prevotella_2"                  
## [25] "Genus:Prevotellaceae_NK3B31_group"   
## [26] "Family:Barnesiellaceae"              
## [27] "Family:Saccharimonadaceae"           
## [28] "Genus:F0332"                         
## [29] "Genus:Libanicoccus"                  
## [30] "Genus:Coriobacteriaceae_UCG-003"     
## [31] "Genus:Lachnospiraceae_NK4B4_group"   
## [32] "Genus:Lachnospiraceae_UCG-008"       
## [33] "Genus:Lactonifactor"                 
## [34] "Genus:UC5-1-2E3"                     
## [35] "Genus:Hungatella"                    
## [36] "Genus:Butyrivibrio"                  
## [37] "Genus:Lachnospiraceae_UCG-003"
# Relative abundance Enterococcus genus:
ps_2_Mitsuokella = subset_taxa(ps_2_RA, Genus == "Mitsuokella")
plot_abundance = function(physeq,title = "",Facet = "Genus", Color = "Genus"){
 mphyseq = psmelt(physeq)
mphyseq <- subset(mphyseq, Abundance > -1)
  ggplot(data = mphyseq, mapping = aes_string(x = "Sample_Type",y = "Abundance",
                                              color = Color, fill = Color)) +
    geom_violin(fill = NA) +
    geom_boxplot(fill=NA, width = 0.1) +
    geom_point(size = 5, alpha = 0.3,
               position = position_jitter(width = 0.3)) +
    facet_wrap(facets = Facet) + scale_y_log10()+
    theme(legend.position="none")
}
plot_abundance(ps_2_Mitsuokella,"") 

# Relative abundance Enterococcus genus:
ps_2_Atopobium = subset_taxa(ps_2_RA, Genus == "Shuttleworthia")
plot_abundance(ps_2_Atopobium,"") 

# Relative abundance Enterococcus genus:
ps_2_Tyzzerella_4 = subset_taxa(ps_2_RA, Genus == "Sellimonas")
plot_abundance(ps_2_Tyzzerella_4,"") 

# Relative abundance Enterococcus genus:
ps_2_Peptostreptococcus = subset_taxa(ps_2_RA, Genus == "Fournierella")
plot_abundance(ps_2_Peptostreptococcus,"") 

# Relative abundance Enterococcus genus:
ps_2_Lactonifactor = subset_taxa(ps_2_RA, Genus == "Ruminococcaceae_UCG-008")
plot_abundance(ps_2_Lactonifactor,"") 

10 Repeated Measures

When dealing with more complex designs we need to account for nesting of samples. In this dataset when we want to compare the changes over time we need to account for the pairing of samples from each of the subjects. For univariate testing you could use paired tests like t.test or wilcoxon. However to allow for more complex designs we will use linear mixed models.

ps_paired <- prune_samples(ps@sam_data$Sample_Type!="Donor", ps)
ps_paired <- rarefy_even_depth(ps_paired)

10.1 Alpha Diversity

df.ad <- cbind(estimate_richness(ps_paired, measures = c("Observed","Shannon")), ps_paired@sam_data %>% data.frame())
df.ad$FPD <- picante::pd(samp = as.matrix(unclass(otu_table(ps_paired))), tree = phy_tree(ps_paired), include.root = F)[gsub("X","",rownames(df.ad)),"PD"]
df.ad.long <- df.ad %>% pivot_longer(cols = c("Shannon","Observed","FPD"), names_to = "Diversity_metric", values_to = "Diversity")

diversity_names <- c(
  `Observed` = "Observed",
  `Shannon` = "Shannon",
  `FPD` = "Phylogenetic Diversity")

p.adiv.f20 <- ggplot(df.ad.long, aes(x=Time_Point, y=Diversity, group=Sample_Type, color=Sample_Type)) + 
  #geom_boxplot(outlier.colour = NA, aes(group=Time_Point)) + 
  ggbeeswarm::geom_beeswarm(size=2, cex = 1,priority = "random") + 
  #facet_grid(variable~Host_Body_Site, scale="free_y", labeller = as_labeller(diversity_names)) +
  facet_grid(Diversity_metric~., scale="free_y") +
  scale_colour_viridis_d(option = "B", begin = 0.25, end = 0.75) + 
  #scale_fill_viridis_d(option = "B", begin = 0.25, end = 0.75, alpha = 1) + 
  geom_line(aes(group=Subject_ID), alpha=0.2) + 
  labs(x=NULL,y=NULL)  + 
  theme_bw() + 
  geom_smooth(method = "lm") + 
  stat_cor() + 
  #geom_line(aes(group=Subject_ID), color="black") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 
  #stat_compare_means() + 
  NULL; plot(p.adiv.f20)

summary(lmer(Shannon~Sample_Type + (1 | Subject_ID), df.ad))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Shannon ~ Sample_Type + (1 | Subject_ID)
##    Data: df.ad
## 
## REML criterion at convergence: 28.7
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.87855 -0.55200  0.02058  0.63760  1.75162 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev.
##  Subject_ID (Intercept) 0.05543  0.2354  
##  Residual               0.04457  0.2111  
## Number of obs: 64, groups:  Subject_ID, 32
## 
## Fixed effects:
##                    Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)         4.59445    0.05590 47.42897  82.189   <2e-16 ***
## Sample_TypePostFMT -0.07907    0.05278 31.00000  -1.498    0.144    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Smpl_TyPFMT -0.472
summary(lmer(Observed~Sample_Type + (1 | Subject_ID), df.ad))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Observed ~ Sample_Type + (1 | Subject_ID)
##    Data: df.ad
## 
## REML criterion at convergence: 712.1
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.83428 -0.51014 -0.04003  0.53135  2.01376 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev.
##  Subject_ID (Intercept) 4435     66.59   
##  Residual               2317     48.14   
## Number of obs: 64, groups:  Subject_ID, 32
## 
## Fixed effects:
##                    Estimate Std. Error     df t value Pr(>|t|)    
## (Intercept)          464.94      14.53  43.31  32.008  < 2e-16 ***
## Sample_TypePostFMT    66.66      12.03  31.00   5.539 4.58e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Smpl_TyPFMT -0.414
summary(lmer(FPD~Sample_Type + (1 | Subject_ID), df.ad))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: FPD ~ Sample_Type + (1 | Subject_ID)
##    Data: df.ad
## 
## REML criterion at convergence: 419.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.5162 -0.4420 -0.1168  0.3515  2.0492 
## 
## Random effects:
##  Groups     Name        Variance Std.Dev.
##  Subject_ID (Intercept) 35.89    5.991   
##  Residual               22.05    4.696   
## Number of obs: 64, groups:  Subject_ID, 32
## 
## Fixed effects:
##                    Estimate Std. Error     df t value Pr(>|t|)    
## (Intercept)          42.745      1.346 44.809  31.766  < 2e-16 ***
## Sample_TypePostFMT    4.624      1.174 31.000   3.939 0.000433 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr)
## Smpl_TyPFMT -0.436

10.2 Beta diversity

We can account for the inter individual variance by what is called multilevel PCA. This allows us to only focus on the variance over the two time points.

diverse.pca = pca(ps_paired@otu_table+1, ncomp = 10, logratio = 'CLR', multilevel = ps_paired@sam_data$Subject_ID)
plot(diverse.pca)

time="Time_Point"
var="Sample_Type"
x="PC1" # first component
y="PC2" # second component

#build df
variates <- diverse.pca$variates$X
df <- data.frame(row.names = rownames(variates),
                 variates,
                 ps_paired@sam_data[rownames(variates),])

# include centroids
df$PTV_TP <- paste0(df[,var],df[,time])
centroids <- aggregate(variates~PTV_TP,data=df,mean)
df <- merge(df,centroids,by="PTV_TP",suffixes=c("",".centroid"))

# plot
p.multi.pca <- ggplot(df, aes_string(x=x, y=y, color=var, shape=time)) +
  geom_segment(aes_string(x=paste0(x,".centroid"), y=paste0(y,".centroid"), xend=x, yend=y), alpha=0.3, color="black") +
  #  scale_color_viridis_d(option = "C", end = 0.75) +
  scale_color_manual(values = RColorBrewer::brewer.pal(n = 5, name = "Set1")) + 
  theme_bw() +
  labs(x=paste0(x,": ",round(diverse.pca$prop_expl_var$X[x], 3)*100, "% variance explained"),
       y=paste0(y,": ",round(diverse.pca$prop_expl_var$X[y], 3)*100, "% variance explained"),
       title="Multilevel PCA") +
  #facet_wrap(~Time_Point) + 
  geom_point(size=3) +
  ggrepel::geom_text_repel(aes(label=Subject_ID)) + 
  NULL; plot(p.multi.pca)

df <- data.frame(row.names = rownames(variates),
                 variates,
                 ps_paired@sam_data[rownames(variates),])

summary(manova(variates[,1:2]~df$Time_Point*df$Sample_Type))
##               Df  Pillai approx F num Df den Df    Pr(>F)    
## df$Time_Point  1 0.58392   42.804      2     61 2.425e-12 ***
## Residuals     62                                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(aov(variates~df$Time_Point*df$Sample_Type))
##  Response PC1 :
##               Df Sum Sq Mean Sq F value   Pr(>F)    
## df$Time_Point  1 2125.5 2125.51  60.081 1.07e-10 ***
## Residuals     62 2193.4   35.38                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response PC2 :
##               Df  Sum Sq Mean Sq F value  Pr(>F)  
## df$Time_Point  1  195.22 195.225  6.2657 0.01496 *
## Residuals     62 1931.79  31.158                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response PC3 :
##               Df  Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point  1   48.65  48.647  1.5516 0.2176
## Residuals     62 1943.86  31.353               
## 
##  Response PC4 :
##               Df  Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point  1    1.05  1.0504  0.0402 0.8418
## Residuals     62 1621.69 26.1563               
## 
##  Response PC5 :
##               Df  Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point  1    0.01  0.0085   4e-04  0.985
## Residuals     62 1477.82 23.8358               
## 
##  Response PC6 :
##               Df  Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point  1    9.78  9.7752  0.4539  0.503
## Residuals     62 1335.13 21.5343               
## 
##  Response PC7 :
##               Df  Sum Sq Mean Sq F value  Pr(>F)  
## df$Time_Point  1   52.61  52.607  2.8087 0.09879 .
## Residuals     62 1161.29  18.730                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##  Response PC8 :
##               Df  Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point  1   30.32  30.321  1.6368 0.2055
## Residuals     62 1148.51  18.524               
## 
##  Response PC9 :
##               Df  Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point  1   19.76  19.756    1.14 0.2898
## Residuals     62 1074.48  17.330               
## 
##  Response PC10 :
##               Df  Sum Sq Mean Sq F value Pr(>F)
## df$Time_Point  1   24.34  24.344  1.4379  0.235
## Residuals     62 1049.68  16.930

We also can account for it in the permanova by using the stata argument

d <- phyloseq::distance(ps_paired, "bray")
adonis2(d ~ ps_paired@sam_data$Sample_Type, strata = ps_paired@sam_data$Subject_ID)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks:  strata 
## Permutation: free
## Number of permutations: 999
## 
## adonis2(formula = d ~ ps_paired@sam_data$Sample_Type, strata = ps_paired@sam_data$Subject_ID)
##                                Df SumOfSqs      R2      F Pr(>F)    
## ps_paired@sam_data$Sample_Type  1   0.2355 0.01763 1.1126  0.001 ***
## Residual                       62  13.1212 0.98237                  
## Total                          63  13.3567 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

10.3 Taxa

We again can use lmer to test differences in taxa abundance. Note due to multiple testing burden it is recommended to reduced the set of taxa that get tested. Here we will only test the top 100 most abundant taxa.

lmerres <- function(asv, ps){
  psm <- psmelt(prune_taxa(asv, ps))
  lmerTest::lmer(Abundance ~ Sample_Type + (1|Subject_ID), psm)
}

taxa <- taxa_sums(ps_paired) %>% sort(decreasing = T) %>% head(250) %>% names()

asv_lmermodels <- lapply(taxa, function(x) lmerres(x, ps_paired))

df <- data.frame(taxa=taxa,
                 pvals = unlist(lapply(asv_lmermodels, function(x) coefficients(summary(x))[2,5])),
                 ps_paired@tax_table[taxa,]
)

df$padj <- p.adjust(df$pvals, method = "fdr")
df[df$padj<0.05,]
##                taxa        pvals  Kingdom     Phylum      Class         Order
## ASVX_7703 ASVX_7703 3.239364e-06 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_568   ASVX_568 6.912786e-04 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_1335 ASVX_1335 4.047797e-05 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_4346 ASVX_4346 3.929851e-05 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_4915 ASVX_4915 1.244758e-03 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_2278 ASVX_2278 6.617822e-04 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_6835 ASVX_6835 2.810065e-04 Bacteria Firmicutes Clostridia Clostridiales
## ASVX_4239 ASVX_4239 4.482078e-05 Bacteria Firmicutes Clostridia Clostridiales
##                    Family                        Genus Species         padj
## ASVX_7703 Lachnospiraceae                        Dorea    <NA> 0.0008098409
## ASVX_568  Lachnospiraceae                 Agathobacter    <NA> 0.0246885225
## ASVX_1335 Lachnospiraceae                      Blautia    <NA> 0.0028012987
## ASVX_4346 Lachnospiraceae                 Agathobacter    <NA> 0.0028012987
## ASVX_4915 Lachnospiraceae Lachnospiraceae_ND3007_group    <NA> 0.0388986868
## ASVX_2278 Lachnospiraceae                        Dorea    <NA> 0.0246885225
## ASVX_6835 Lachnospiraceae                      Blautia    <NA> 0.0140503265
## ASVX_4239 Lachnospiraceae                    Roseburia    <NA> 0.0028012987
asv <- df[df$padj<0.05,] %>% rownames()
psm <- psmelt(prune_taxa(asv, ps_paired))
ggplot(psm, aes(x=Sample_Type, y=Abundance)) +
  ggbeeswarm::geom_beeswarm() +
  geom_boxplot() + 
  facet_wrap(~OTU+Genus, scales="free") 

11 Comparing microbiome of patients to the donor

To test if there is any engraftment of species we can test if the distance of the patients is reduced after FMT.

ps_rare <- rarefy_even_depth(ps)
d <- distance(ps_rare, "jaccard", binary=T)
vegan::adonis2(d~ps_rare@sam_data$Sample_Type)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
## 
## vegan::adonis2(formula = d ~ ps_rare@sam_data$Sample_Type)
##                              Df SumOfSqs      R2      F Pr(>F)    
## ps_rare@sam_data$Sample_Type  2   0.7578 0.04058 1.5439  0.001 ***
## Residual                     73  17.9161 0.95942                  
## Total                        75  18.6740 1.00000                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
df <- reshape2::melt(as.matrix(d)) # we convert the matrix to long format (each index becomes a row)

df$Var1_Donor_ID <- ps_rare@sam_data[df$Var1,]$Donor_Sample_ID
df$Var2_Donor_ID <- ps_rare@sam_data[df$Var2,]$Donor_Sample_ID

df$Var1_Sample_Type <- ps_rare@sam_data[df$Var1,]$Sample_Type
df$Var2_Sample_Type <- ps_rare@sam_data[df$Var2,]$Sample_Type

df <- df[df$Var1_Sample_Type!="Donor",] # We only want to compare distance between recipient/donors
df <- df[df$Var2_Sample_Type=="Donor",]

df$Donor_match <- df$Var1_Donor_ID==df$Var2_Donor_ID # check if the comparison is within or between

ggplot(df, aes(x=Donor_match, y=value)) +
  facet_wrap(~Var1_Sample_Type) + 
  geom_jitter() +
  stat_compare_means() + 
  NULL

12 Including clinical outcomes